Contents

library(tidyverse)
library(magrittr)
library(tximeta)
library(DESeq2)
library(UpSetR)
library(pheatmap)
library(RColorBrewer)
library(ggfortify)
library(glmnet)
library(EnhancedVolcano)
library(umap)
library(biomaRt)
#library(ggbeeswarm)

This markdown is based on this vignette and describes the processing of the FASTQ files from the RNAseq data of Fleischer et al., as well as performing a differential expression analysis. The differentially expressed genes are then used for building Steiner trees.

1 FASTQ files

The FASTQ files from the study from Fleischer et al. were downloaded from the European Nucleotide Archive ENA. The used accession numbers were: SRR7093809 - SRR7093951

2 Transcript quantification with Salmon

For the transcript quantification the tool salmon was used according to this vignette. For the quantification, first a reference transcriptome is needed for the alignment. A human reference transcriptome can be downloaded from ensemblgenomes. Release 105 was used instead of the newest one 106 because tximeta only works for versions up to 105 at the moment. Then salmon was used to build an index on that transcriptome (salmon index -t ../genome_data/Homo_sapiens.GRCh38.cdna.all.fa.gz -i grch38_index). Finally, the samples were quantified with: salmon quant -i grch38_index -l A -r /mnt/jana/fastq/{sample}.fastq.gz -p 8 –validateMappings -o quants/{sample}_quant. This command produces an output folder for each sample that contains the quant.sf file, which includes the count values, used in the following.

3 Obtaining count data

3.1 Reading in data with tximeta

#20-bins
age_groups <- data.frame(Age = as.character(seq(1, 96, 1)), 
                         age_group = c(rep('1-19', 19), rep('20-39', 20), 
                                       rep('40-59', 20), rep('60-79', 20),
                                       rep('80-96', 17)))
#same number of patients per bin
age_groups <- data.frame(Age = as.character(seq(1, 96, 1)), 
                         age_group = c(rep('1-19', 19), rep('20-35', 16), 
                                       rep('36-61', 26), rep('62-84', 23),
                                       rep('85-96', 12)))

#small-big-small
age_groups <- data.frame(Age = as.character(seq(1, 96, 1)), 
                         age_group = c(rep('1-9', 9), rep('10-29', 20), 
                                       rep('30-59', 30), rep('60-79', 20),
                                       rep('80-89', 10), rep('90-96', 7)))

age_groups <- data.frame(Age = as.character(seq(1, 96, 1)), 
                         age_group = c(rep('1-15', 15), rep('16-37', 22), 
                                      rep('38-57', 20), rep('58-85', 28),
                                      rep('86-96', 11)))

age_groups <- data.frame(Age = as.character(seq(1, 96, 1)), 
                         age_group = c(rep('1-15', 15), rep('16-26', 11), 
                                      rep('27-60', 34), rep('61-85', 25),
                                      rep('86-96', 11)))
# get info about the runs
runs_info <- read.delim(paste0(dir, "Data/SraRunTable.txt"), sep = ",") %>%
  mutate(Age = gsub("y[rs]\\d+mos", "", Age)) %>%
  mutate(Age = gsub("yr", "", Age)) %>%
  left_join(age_groups) %>%
  mutate(age_group = factor(age_group, levels = unique(age_groups$age_group))) %>%
  filter(disease == "Normal") 
## Joining, by = "Age"
  #filter(sex == "male")
head(runs_info, n = 3)
##          Run Age Assay.Type AvgSpotLen      Bases  BioProject    BioSample
## 1 SRR7093810  12    RNA-Seq         75 1538664000 PRJNA454681 SAMN09011820
## 2 SRR7093812  25    RNA-Seq         75 1495228725 PRJNA454681 SAMN09011818
## 3 SRR7093815  26    RNA-Seq         75 1595644800 PRJNA454681 SAMN09011815
##       Bytes cell_id Center.Name Consent DATASTORE.filetype DATASTORE.provider
## 1 574146143 AG16409         GEO  public          sra,fastq         ncbi,gs,s3
## 2 540157644 AG09975         GEO  public          fastq,sra         gs,s3,ncbi
## 3 600307232 AG07124         GEO  public          sra,fastq         gs,ncbi,s3
##                 DATASTORE.region disease Experiment GEO_Accession..exp.
## 1 s3.us-east-1,gs.US,ncbi.public  Normal SRX4022457          GSM3124561
## 2 gs.US,s3.us-east-1,ncbi.public  Normal SRX4022459          GSM3124563
## 3 gs.US,ncbi.public,s3.us-east-1  Normal SRX4022462          GSM3124566
##    Instrument LibraryLayout LibrarySelection  LibrarySource     Organism
## 1 NextSeq 500        SINGLE             cDNA TRANSCRIPTOMIC Homo sapiens
## 2 NextSeq 500        SINGLE             cDNA TRANSCRIPTOMIC Homo sapiens
## 3 NextSeq 500        SINGLE             cDNA TRANSCRIPTOMIC Homo sapiens
##   Platform          ReleaseDate Sample.Name    sex SRA.Study       source_name
## 1 ILLUMINA 2018-11-17T00:00:00Z  GSM3124561   male SRP144355 Skin; Unspecified
## 2 ILLUMINA 2018-11-17T00:00:00Z  GSM3124563   male SRP144355         Skin; Arm
## 3 ILLUMINA 2018-11-17T00:00:00Z  GSM3124566 female SRP144355         Skin; Arm
##   Ethnicity age_group
## 1 Caucasian      1-15
## 2 caucasian     16-26
## 3 Caucasian     16-26
runs_info$names <- runs_info$Run
runs_info$files <- paste0(dir, "Data/rna_counts/", runs_info$names, "_quant/quant.sf")
#file.exists(runs_info$files)
runs_info %<>% mutate(Age = as.numeric(Age)) %>%
  mutate(Sex = ifelse(sex == 'male', 'Male', 'Female'))

ggplot(runs_info, aes(x = Age, fill = Sex)) + 
  geom_bar() +
  theme_bw() + 
  xlab('Age') + 
  ylab('Number of individuals') +
  theme(text = element_text(size = 18))

ggplot(runs_info, aes(x = age_group)) +
  geom_bar() +
  theme_bw() + 
  ylab('number of individuals') +
  theme(text = element_text(size = 17))

# create summarized experiment with tximeta
se <- tximeta(runs_info)
## importing quantifications
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 
## found matching transcriptome:
## [ Ensembl - Homo sapiens - release 105 ]
## loading existing EnsDb created: 2022-10-25 14:37:43
## loading existing transcript ranges created: 2022-10-25 14:37:49
# convert from ensembl transcript id to gene id
gse <- summarizeToGene(se)
## loading existing EnsDb created: 2022-10-25 14:37:43
## obtaining transcript-to-gene mapping from database
## loading existing gene ranges created: 2022-10-25 14:37:58
## summarizing abundance
## summarizing counts
## summarizing length
gse <- gse[rowData(gse)$gene_biotype == "protein_coding", ]
dim(assay(gse))
## [1] 21871   133
# filter for genes that have counts of at least 10 in at least 5 samples
keep <- rowSums(assay(gse) >= 10) >= 5
gse <- gse[keep,]

#keep2 <- rowMax(assay(gse)) /rowMin(assay(gse)) > 5
#gse <- gse[keep2,]

assay(gse)[1:5, 1:5]
##                 SRR7093810 SRR7093812 SRR7093815 SRR7093817 SRR7093819
## ENSG00000000003    357.013    450.912    588.386    557.761    592.659
## ENSG00000000419    622.399    648.336    690.053    635.352    836.884
## ENSG00000000457    218.196    179.125    191.908    226.061    272.195
## ENSG00000000460    121.005    256.007    357.007    358.488    543.658
## ENSG00000000971    175.001    367.989    369.000    624.000    177.999
dim(assay(gse))
## [1] 15337   133

4 Differential expression analysis with DESeq2

dds <- DESeqDataSet(gse, design = ~ age_group)
## using counts and average transcript lengths from tximeta
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
# differential expression analysis
dds <- DESeq(dds)
## estimating size factors
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## fitting model and testing
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## -- replacing outliers and refitting for 221 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## fitting model and testing
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
foldchanges <- lapply(seq(1, length(unique(age_groups$age_group))-1, 1), function(ix) {
    t0 = unique(age_groups$age_group)[ix]
    t1 <- unique(age_groups$age_group)[ix+1]
    # calculate log2 foldchange between two age intervals 
    res <- results(dds, contrast=c("age_group", t1, t0))
    res$gene <- rowData(dds)$symbol
    res$transition <- paste0('fc_', t0, '_', t1)
    res <- subset(res, select = c(gene, transition, log2FoldChange, pvalue, padj)) %>%
      as.data.frame() %>%
      rownames_to_column(var = "ensembl_id") %>%
      filter(gene != "")
  return(res)}) %>%
  bind_rows() %>%
  mutate(abs_log2_fc = abs(log2FoldChange)) %>%
  arrange(desc(abs_log2_fc)) %>%
  drop_na(padj)

head(foldchanges)
##        ensembl_id    gene     transition log2FoldChange       pvalue
## 1 ENSG00000154839    SKA1 fc_16-26_27-60       50.00218 3.105491e-68
## 2 ENSG00000228177 C6orf47 fc_16-26_27-60       42.60803 2.140504e-20
## 3 ENSG00000278259   MYO19 fc_61-85_86-96       38.90985 5.526890e-70
## 4 ENSG00000278259   MYO19 fc_27-60_61-85      -36.81495 2.630264e-86
## 5 ENSG00000276023  DUSP14 fc_61-85_86-96       33.20129 6.807138e-25
## 6 ENSG00000204592   HLA-E fc_61-85_86-96       32.25490 3.287560e-12
##           padj abs_log2_fc
## 1 4.762580e-64    50.00218
## 2 6.565354e-17    42.60803
## 3 8.476039e-66    38.90985
## 4 4.033773e-82    36.81495
## 5 6.228426e-23    33.20129
## 6 3.429798e-11    32.25490

Save all considered genes (the ones with counts >= 10 in at least 5 samples) with ensembl id and gene symbol

all_genes <- subset(foldchanges, select = c(ensembl_id, gene)) %>%
  distinct() 

write.csv(all_genes, paste0(dir, 'Data/all_genes.csv'), row.names = FALSE)

4.1 Number of DE genes per transition

group_by(foldchanges, transition) %>%
  summarize("p<0.01" = length(which(padj < 0.01)),
            "p<0.05" = length(which(padj < 0.05)),
            "p<0.1" = length(which(padj < 0.1)),
            "p<0.2" = length(which(padj < 0.2)))
## # A tibble: 4 × 5
##   transition     `p<0.01` `p<0.05` `p<0.1` `p<0.2`
##   <chr>             <int>    <int>   <int>   <int>
## 1 fc_1-15_16-26       126      414     745    1545
## 2 fc_16-26_27-60       28      142     336     631
## 3 fc_27-60_61-85       48      161     327     834
## 4 fc_61-85_86-96     7034     8490    9308   10309

4.2 Robustness of selected DE genes

DE_subsampling <- read.delim(paste0(dir, "Data/subsampling_DE_0.1.csv"), sep = ",") %>%
  group_by(transition, gene) %>%
  summarise(n = n()) %>%
  arrange(desc(n)) %>%
  group_by(transition) %>% 
  mutate(rank = rank(-n, ties.method = 'first'))
## `summarise()` has grouped output by 'transition'. You can override using the
## `.groups` argument.
names <- data.frame(transition = c("fc_1-15_16-26", "fc_16-26_27-60", "fc_27-60_61-85", "fc_61-85_86-96"), 
                    group = c("Group 1 vs. Group 2", "Group 2 vs. Group 3", "Group 3 vs. Group 4", "Group 4 vs. Group 5"))

DE_subsampling %<>% left_join(names)
## Joining, by = "transition"
# How often does each gene occur in the simulations?
ggplot(DE_subsampling, aes(x = as.numeric(rank), y = n)) + 
  geom_line() + 
  facet_wrap(~group, scales = "free_x", nrow = 1) +
  theme_bw() +
  xlab("ranked genes") +
  ylab("% subsamples \n with p < 0.1") +
  theme(text = element_text(size = 18))

filter(DE_subsampling, n > 25) %>% summarize(n = n())
## # A tibble: 4 × 2
##   transition         n
##   <chr>          <int>
## 1 fc_1-15_16-26    630
## 2 fc_16-26_27-60   229
## 3 fc_27-60_61-85   260
## 4 fc_61-85_86-96  9381

4.3 Save results

# Filtering: such that around 200 robust & significant DE genes per transition 
thresholds <- data.frame('transition' = c('fc_1-15_16-26', 'fc_16-26_27-60', 
                                           'fc_27-60_61-85', 'fc_61-85_86-96'), 
                          'p_threshold' = c(0.05, 0.1, 0.1, 1e-17), 
                          'n_threshold' = c(50, 25, 20, 99), 
                          'fc_threshold' = c(0.6, 0.4, 0.4, 1.6))

foldchanges_filtered <- subset(foldchanges, select = -c(pvalue)) %>% 
  left_join(DE_subsampling, by = c('gene', 'transition')) %>%
  left_join(thresholds) %>%
  filter(padj < p_threshold & n > n_threshold & abs_log2_fc > fc_threshold)
## Joining, by = "transition"
group_by(foldchanges_filtered, transition) %>% summarize(n = n())
## # A tibble: 4 × 2
##   transition         n
##   <chr>          <int>
## 1 fc_1-15_16-26    182
## 2 fc_16-26_27-60   162
## 3 fc_27-60_61-85   163
## 4 fc_61-85_86-96   184
# Save the results for various thresholds on the adjusted p-value
p_0.05 <- foldchanges_filtered %>%
  group_by(gene, transition) %>%  
  # mean for multiple ensembl-ids belonging to the same gene
  summarize(log2FoldChange = mean(log2FoldChange)) %>%
  mutate(abs_log2_fc = abs(log2FoldChange)) %>%
  mutate(updown = ifelse(log2FoldChange > 0, 'up', 'down'))
## `summarise()` has grouped output by 'gene'. You can override using the
## `.groups` argument.
write.csv(p_0.05, paste0(dir, 'Data/DE_updown.csv'), row.names = FALSE)
write.csv(subset(p_0.05, select = c(gene, transition, abs_log2_fc)), 
          paste0(dir, "Data/DE_var_p_n_200.csv"), row.names = FALSE)

Select example genes with highest absolute log2 fold change for a given transition:

filter(foldchanges_filtered, transition == "fc_27-60_61-85") %>%
  arrange(desc(abs_log2_fc)) %>%
  head(n = 5)
##        ensembl_id   gene     transition log2FoldChange         padj abs_log2_fc
## 1 ENSG00000278259  MYO19 fc_27-60_61-85     -36.814945 4.033773e-82   36.814945
## 2 ENSG00000204227  RING1 fc_27-60_61-85     -25.978800 4.799849e-56   25.978800
## 3 ENSG00000157870 PRXL2B fc_27-60_61-85     -24.293853 1.065110e-28   24.293853
## 4 ENSG00000179772  FOXS1 fc_27-60_61-85       3.474460 4.540675e-02    3.474460
## 5 ENSG00000174697    LEP fc_27-60_61-85       3.338923 1.808030e-05    3.338923
##     n rank               group p_threshold n_threshold fc_threshold
## 1  99    4 Group 3 vs. Group 4         0.1          20          0.4
## 2 100    3 Group 3 vs. Group 4         0.1          20          0.4
## 3  91   12 Group 3 vs. Group 4         0.1          20          0.4
## 4  47  114 Group 3 vs. Group 4         0.1          20          0.4
## 5  51   98 Group 3 vs. Group 4         0.1          20          0.4

4.4 Extract counts

coldata <- as.data.frame(subset(colData(dds), select = c(names, Age, age_group, sex)))

count_data <- fpkm(dds) %>%
  as.data.frame() %>%
  rownames_to_column(var = "ensembl_id") %>%
  mutate(gene = rowData(dds)$symbol) %>%
  gather(key = "names", value = "counts", -c(ensembl_id, gene)) %>%
  left_join(coldata, by = "names") %>%
  filter(gene != "")
write.csv(count_data, paste0(dir, "Data/count_data.csv"), row.names = FALSE)
head(count_data)
##        ensembl_id     gene      names    counts Age age_group  sex
## 1 ENSG00000000003   TSPAN6 SRR7093810  8.675290  12      1-15 male
## 2 ENSG00000000419     DPM1 SRR7093810 35.885730  12      1-15 male
## 3 ENSG00000000457    SCYL3 SRR7093810  3.334175  12      1-15 male
## 4 ENSG00000000460 C1orf112 SRR7093810  2.652753  12      1-15 male
## 5 ENSG00000000971      CFH SRR7093810  3.709251  12      1-15 male
## 6 ENSG00000001036    FUCA2 SRR7093810 53.758105  12      1-15 male
# Mean counts per age group
mean_counts <- count_data %>% group_by(ensembl_id, gene, age_group) %>%
  summarize(counts = mean(counts)) %>%
  ungroup()
## `summarise()` has grouped output by 'ensembl_id', 'gene'. You can override
## using the `.groups` argument.
write.csv(mean_counts, paste0(dir, "Data/mean_counts.csv"), row.names = FALSE)
head(mean_counts)
## # A tibble: 6 × 4
##   ensembl_id      gene   age_group counts
##   <chr>           <chr>  <fct>      <dbl>
## 1 ENSG00000000003 TSPAN6 1-15        9.38
## 2 ENSG00000000003 TSPAN6 16-26      12.1 
## 3 ENSG00000000003 TSPAN6 27-60      12.5 
## 4 ENSG00000000003 TSPAN6 61-85      11.4 
## 5 ENSG00000000003 TSPAN6 86-96      16.5 
## 6 ENSG00000000419 DPM1   1-15       36.9
# get activities of all genes per age group (based on FPKM values)
fpkm <- fpkm(dds) %>% 
  as.data.frame() %>%
  mutate(gene = rowData(dds)$symbol) %>%
  rownames_to_column(var = "ensembl_gene_id") %>%
  gather(key = "names", value = "counts", -c(ensembl_gene_id, gene)) %>%
  left_join(coldata, by = "names") %>%
  filter(gene != "") %>%
  group_by(gene, age_group) %>%
  summarize(counts = mean(counts)) %>%
  mutate(activity = ifelse(counts > 0.8, 'active', 'inactive'))
## `summarise()` has grouped output by 'gene'. You can override using the
## `.groups` argument.
fpkm_plot <- fpkm %>%
  mutate(age_group = recode(age_group, "1-15" = "Group 1", "16-26" = "Group 2", 
                            "27-60" = "Group 3", "61-85" = "Group 4", "86-96" = "Group 5"))

ggplot(fpkm_plot, aes(x = log(counts+1))) +
  geom_histogram(bins = 40) +
  geom_vline(xintercept = log(0.8 + 1), color = "red") +
  facet_wrap(~age_group, nrow = 1) +
  theme_bw() +
  theme(text = element_text(size = 22)) + 
  xlab('log(FPKM + 1)')

gene_activity <- data.frame(transition = c('fc_1-15_16-26', 'fc_16-26_27-60', 
                          'fc_27-60_61-85', 'fc_61-85_86-96')) %>%
  separate(transition, c('fc', 't0', 't1'), sep = "_", remove = FALSE) %>%
  left_join(dplyr::rename(fpkm, t0 = age_group, a0 = activity, c0 = counts)) %>%
  left_join(dplyr::rename(fpkm, t1 = age_group, a1 = activity, c1 = counts)) %>%
  mutate(activity = paste0(a0, "_", a1)) %>%
  dplyr::filter(activity != 'inactive_inactive') %>%
  subset(select = c(transition, gene, activity))
## Joining, by = "t0"
## Joining, by = c("t1", "gene")
write.csv(gene_activity, paste0(dir, "Data/gene_activity.csv"), row.names = FALSE)
head(gene_activity)
##      transition   gene      activity
## 1 fc_1-15_16-26   A1BG active_active
## 2 fc_1-15_16-26    A2M active_active
## 3 fc_1-15_16-26 A4GALT active_active
## 4 fc_1-15_16-26   AAAS active_active
## 5 fc_1-15_16-26   AACS active_active
## 6 fc_1-15_16-26  AADAT active_active
#split DE genes into list of groups according to the transitions
ages <- c("1-15", "16-26", "27-60", "61-85", "86-96")

top_list <- filter(fpkm, activity == 'active') %>% 
  mutate(age_group = factor(age_group, levels = ages)) %>%
  group_by(age_group) %>% 
  group_split()
names(top_list) <- c("Group 1", "Group 2", "Group 3", "Group 4", "Group 5")
top_list <- lapply(top_list, function(list) list$gene)

upset(fromList(top_list), order.by = "freq", nsets = 9,
       mainbar.y.label = "Intersections", sets.x.label = "Active Genes",
       text.scale = c(3.5, 3.5, 3, 3, 3.5, 3.5), nintersects = 20,
       keep.order= TRUE, sets = c("Group 5", "Group 4", "Group 3",
                                  "Group 2", "Group 1"))

5 Visualizations of DE genes

5.1 Distribution of p-values

foldchanges %<>% left_join(names)
## Joining, by = "transition"
ggplot(foldchanges, aes(x = pvalue)) +
  geom_histogram() +
  facet_wrap(~group, scale = "free_y", nrow = 1) +
  xlab('p-value') +
  theme_bw() +
  theme(text = element_text(size = 18))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

5.2 Intersection of DE genes

#split DE genes into list of groups according to the transitions
transitions <- c("fc_1-15_16-26", "fc_16-26_27-60", "fc_27-60_61-85", 
                 "fc_61-85_86-96")

top_list <- p_0.05 %>% 
  mutate(transition = factor(transition, levels = transitions)) %>%
  group_by(transition) %>% 
  group_split()
names(top_list) <- c("Group 1 vs. Group 2", "Group 2 vs. Group 3", "Group 3 vs. Group 4", "Group 4 vs. Group 5")
top_list <- lapply(top_list, function(list) list$gene)

upset(fromList(top_list), order.by = "freq", nsets = 9,
       mainbar.y.label = "Intersections", sets.x.label = "DE Genes",
       text.scale = c(3.5, 3.5, 3, 3, 3.5, 3.5), nintersects = 20,
       keep.order= TRUE, sets = c("Group 4 vs. Group 5", "Group 3 vs. Group 4", 
                                  "Group 2 vs. Group 3", "Group 1 vs. Group 2"))

5.3 PCA of the samples

# variance-stabilizing transformation
vsd <- vst(dds)
#vsd <- vsd[rownames(vsd) %in% lasso$ensembl_id,]
# PCA
pca <- plotPCA(vsd, intgroup = c("age_group"), returnData = TRUE) 
percentVar <- round(100 * attr(pca, "percentVar"))
ggplot(pca, aes(PC1, PC2, color=age_group)) +
  geom_point(size=3) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) +
  coord_fixed() +
  theme_bw() +
  theme(text = element_text(size = 16)) +
  scale_colour_manual(values = c('#ffd166', '#ef476f', '#06d6a0', '#118ab2', '#073b4c'))

df_pca <- rownames_to_column(data.frame(t(assay(vsd)))) %>%
  left_join(rownames_to_column(subset(data.frame(colData(vsd)), select = c('Age', 'sex', 'source_name', 
                                                               'Ethnicity', 'age_group')))) 
## Joining, by = "rowname"
pca <- prcomp(df_pca[, 2:(ncol(df_pca)-5)])
autoplot(pca, data = df_pca, colour = "age_group", x = 1, y = 2)  + 
  theme_bw() +
  theme(text = element_text(size = 20))

# Is another PC than 1 & 2 higher correlated to age?
cor(df_pca[, 15339], pca$x[, 1])
## [1] -0.5681123
cor(df_pca[, 15339], pca$x[, 2])
## [1] 0.1992682
cor(df_pca[, 15339], pca$x[, 3])
## [1] 0.1445767
cor(df_pca[, 15339], pca$x[, 4])
## [1] -0.1405156
cor(df_pca[, 15339], pca$x[, 5])
## [1] 0.02776604
ggplot(runs_info, aes(x = Age)) + 
  geom_histogram() + 
  facet_wrap(~source_name)
set.seed(142)
umap_fit <- umap(df_pca[, 2:(ncol(df_pca)-5)])

umap_df <- umap_fit$layout %>%
  as.data.frame()%>%
  dplyr::rename(UMAP1="V1", UMAP2="V2") %>%
  mutate(sample=df_pca$rowname) %>%
  inner_join(subset(data.frame(colData(vsd)), 
                    select = c('Age', 'sex', 'source_name', 'Ethnicity', 'age_group')) %>%
               rownames_to_column(var = "sample"))
## Joining, by = "sample"
ggplot(umap_df, aes(x = UMAP1, y = UMAP2, color = age_group)) +
  geom_point() +
  theme_bw() +
  theme(text = element_text(size = 20))

# save counts with variance-stabilizing transformation for MEFISTO
count_data_vst <- assay(vsd) %>%
  as.data.frame() %>%
  rownames_to_column(var = "ensembl_id") %>%
  mutate(gene = rowData(vsd)$symbol) %>%
  gather(key = "names", value = "counts", -c(ensembl_id, gene)) %>%
  left_join(coldata, by = "names") %>%
  filter(gene != "")
write.csv(count_data_vst, paste0(dir, "MOFA/Data/vst_counts.csv"), row.names = FALSE)
write.csv(count_data_vst, paste0(dir, "Data/vst_counts.csv"), row.names = FALSE)

5.4 Gene expression of following age intervals

# Mean counts per age group
mean_counts <- count_data_vst %>% group_by(ensembl_id, gene, age_group) %>%
  summarize(counts = mean(counts)) %>%
  ungroup()
## `summarise()` has grouped output by 'ensembl_id', 'gene'. You can override
## using the `.groups` argument.
#write.csv(mean_counts, paste0(dir, "Data/mean_counts.csv"), row.names = FALSE)
head(mean_counts)
## # A tibble: 6 × 4
##   ensembl_id      gene   age_group counts
##   <chr>           <chr>  <fct>      <dbl>
## 1 ENSG00000000003 TSPAN6 1-15        8.74
## 2 ENSG00000000003 TSPAN6 16-26       9.08
## 3 ENSG00000000003 TSPAN6 27-60       9.12
## 4 ENSG00000000003 TSPAN6 61-85       9.01
## 5 ENSG00000000003 TSPAN6 86-96       9.46
## 6 ENSG00000000419 DPM1   1-15        9.48
groups = data.frame(transition = transitions, 
                   groups = c("Group 1 vs. Group 2", "Group 2 vs. Group 3", "Group 3 vs. Group 4", "Group 4 vs. Group 5"))

expr_fc <- foldchanges %>% separate(transition, c("fc", "t0", "t1"), sep = "_", remove = FALSE) %>%
  left_join(dplyr::rename(mean_counts, t0 = age_group, counts_t0 = counts)) %>%
  left_join(dplyr::rename(mean_counts, t1 = age_group, counts_t1 = counts)) %>%
  subset(select = c(ensembl_id, gene, transition, counts_t0, counts_t1, log2FoldChange, abs_log2_fc, padj)) %>%
  left_join(mutate(p_0.05, selected = 'DE genes')) %>%
  mutate(selected = factor(replace_na(selected, 'Non DE genes'), levels = c("DE genes", "Non DE genes"))) %>%
  #arrange(desc(selected)) %>%
  left_join(groups)
## Joining, by = c("ensembl_id", "gene", "t0")
## Joining, by = c("ensembl_id", "gene", "t1")
## Joining, by = c("gene", "transition", "log2FoldChange", "abs_log2_fc")
## Joining, by = "transition"
tail(expr_fc)
##            ensembl_id    gene     transition counts_t0 counts_t1 log2FoldChange
## 59827 ENSG00000119632 IFI27L2 fc_27-60_61-85  5.659593  5.659593              0
## 59828 ENSG00000170270    GON7 fc_27-60_61-85  5.659593  5.659593              0
## 59829 ENSG00000204592   HLA-E fc_27-60_61-85  5.659593  5.659593              0
## 59830 ENSG00000223654   FLOT1 fc_27-60_61-85  5.659593  5.659593              0
## 59831 ENSG00000236428 PPP1R18 fc_27-60_61-85  5.659593  5.659593              0
## 59832 ENSG00000276023  DUSP14 fc_27-60_61-85  5.659593  5.659593              0
##       abs_log2_fc padj updown     selected              groups
## 59827           0    1   <NA> Non DE genes Group 3 vs. Group 4
## 59828           0    1   <NA> Non DE genes Group 3 vs. Group 4
## 59829           0    1   <NA> Non DE genes Group 3 vs. Group 4
## 59830           0    1   <NA> Non DE genes Group 3 vs. Group 4
## 59831           0    1   <NA> Non DE genes Group 3 vs. Group 4
## 59832           0    1   <NA> Non DE genes Group 3 vs. Group 4
group.colors <- c("Non DE genes" = "grey", "DE genes" = "red")
  
ggplot(expr_fc, aes(x = counts_t0 , y = counts_t1 , color = selected)) + 
  geom_point(size = 0.5, alpha = 0.6) + 
  facet_wrap(~groups, nrow = 1) + 
  theme_bw() + 
  geom_smooth(method='lm', color = 'black', size = 0.5) +
  scale_color_manual(values=group.colors) +
  xlab("Mean expression in the first age group") +
  ylab("Mean expression in \n the second age group") +
  theme(text = element_text(size = 20)) +
  xlim(0, 22) +
  ylim(0,22)  + 
  theme(legend.position="bottom")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## `geom_smooth()` using formula = 'y ~ x'

Why are some of the selected genes so close to the black line?

expr_fc %>% 
  mutate(diff = abs((counts_t0 - counts_t1)/(counts_t1+0.1))) %>% 
  filter(selected == "True") %>% 
  arrange(diff) %>% head()
##  [1] ensembl_id     gene           transition     counts_t0      counts_t1     
##  [6] log2FoldChange abs_log2_fc    padj           updown         selected      
## [11] groups         diff          
## <0 Zeilen> (oder row.names mit Länge 0)
geneCounts <- plotCounts(dds, gene = "ENSG00000128606", intgroup = c("age_group"),
                         returnData = TRUE, normalized = TRUE)
ggplot(geneCounts, aes(x = age_group, y = count)) +
  #geom_beeswarm(cex = 3) +
  geom_violin(draw_quantiles = c(0.5), trim = TRUE) +
  theme_bw()

5.5 Volcano Plot

EnhancedVolcano(expr_fc,
  lab = expr_fc$gene,
  x = 'log2FoldChange',
  y = 'padj') +
  facet_wrap(~transition, scales = "free")

6 Hierarchical clustering to define the age groups

6.1 LASSO

df = data.frame(t(assay(gse))) %>%
  rownames_to_column(var = "sample") %>%
  left_join(rownames_to_column(subset(data.frame(colData(gse)), select = c("Age")), var = 'sample'))
## Joining, by = "sample"
model <- glmnet(subset(df, select = -c(sample, Age)), df$Age, alpha = 1, lambda = 1)
lasso <- data.frame(ensembl_id = dimnames(coef(model))[[1]], coefficient = matrix(coef(model))) %>%
  filter(coefficient != 0) %>%
  filter(ensembl_id != "(Intercept)") %>%
  arrange(desc(abs(coefficient))) 
lasso_plot <- lasso %>%
  # add corresponding gene names to the ensembl ids
  left_join(dplyr::rename(as_tibble(subset(rowData(gse), select = c(gene_id, gene_name))), 
                          'ensembl_id' = 'gene_id')) %>%
  filter(gene_name != "")
## Joining, by = "ensembl_id"
lasso_plot %<>% mutate(gene_name = factor(gene_name, levels = lasso_plot$gene_name)) 
write.csv(lasso_plot, paste0(dir, "Data/lasso_genes.csv"), row.names = FALSE)

ggplot(lasso_plot[1:25,], aes(x = gene_name, y = abs(coefficient))) +
  geom_bar(stat='identity') +
  theme_bw() +
  xlab('LASSO-selected genes') +
  ylab('Absolute LASSO \n regression coefficient') +
  theme(axis.text.x = element_text(angle = 90)) +
  theme(text = element_text(size = 18))

6.2 Hierarchical clustering on LASSO features

#lasso <- read.delim(paste0(dir, "MOFA/Data/lasso_coef.csv"), 
 #                                    sep = ",", header = TRUE)
# subset to features from LASSO regression
#vsd <- vsd[rownames(vsd) %in% lasso$ensembl_id,]

# use MEFISTO genes for clustering
#mefisto_pos <- read.delim(paste0(dir, "MOFA/Data/top_genes_pos_0.1.csv"), 
#                                     sep = ",", header = TRUE)
#mefisto_neg <- read.delim(paste0(dir, "MOFA/Data/top_genes_neg_0.1.csv"), 
#                                     sep = ",", header = TRUE)
#MEFISTO_genes <- bind_rows(mefisto_pos, mefisto_neg)
#
#vsd <- vsd[rowData(vsd)$gene_name %in% MEFISTO_genes$gene,]

sampleDists <- dist(t(assay(vsd[rownames(vsd) %in% lasso$ensembl_id,])))
sampleDistMatrix <- as.matrix(sampleDists)

annotation <- data.frame(subset(colData(vsd), select = c(age_group))) %>%
  rownames_to_column(var = "sample")
age_groups <- data.frame("age_group" = c("1-15", "16-26", "27-60", "61-85", "86-96"),
                         "Groups" = c("Group 1 (1-15y)", "Group 2 (16-26y)", "Group 3 (27-60y)", 
                                          "Group 4 (61-85y)", "Group 5 (86-96y)"))
annotation %<>% left_join(age_groups) %>% 
  column_to_rownames(var = "sample") %>%
  subset(select = c(Groups))
## Joining, by = "age_group"
#annotation <- pred_age %>% dplyr::rename(Age = predicted_age) %>%
#  left_join(age_groups) %>%
#  column_to_rownames(var = "names") %>%
#  subset(select = age_group)

colors <- colorRampPalette(rev(brewer.pal(9, "Reds")))(255)
#newCols <- colorRampPalette(grDevices::rainbow(5))
#annoCol <- newCols(5)
annoCol <- c('#ffd166', '#ef476f', '#06d6a0', '#118ab2', '#073b4c')
names(annoCol) <- c("Group 1 (1-15y)", "Group 2 (16-26y)", "Group 3 (27-60y)", "Group 4 (61-85y)", "Group 5 (86-96y)")
annoCol <- list(Groups = annoCol)

pheatmap(sampleDistMatrix,
         clustering_distance_rows = sampleDists,
         clustering_distance_cols = sampleDists, 
         annotation_col = annotation, annotation_colors = annoCol,
         clustering_method = "ward.D2",
         col = colors, show_colnames = F, show_rownames = F, fontsize = 16, annotation_names_col = FALSE) 

dend <- hclust(sampleDists, method = 'ward.D2')
plot(dend, labels = FALSE)

annotation <- data.frame(subset(colData(vsd), select = c(Age)))
#annotation <- pred_age %>% dplyr::rename(Age = predicted_age) %>%
#  left_join(age_groups) %>%
#  column_to_rownames(var = "names")

ordering <- data.frame('cluster' = c(1, 2, 3, 4, 5, 6), 
                       'ordered_cluster' = c(4, 2, 3, 5, 6, 1)) #to resort the clusters according to the covered age range

clusters <- data.frame(cluster = cutree(dend, 6)) %>%
  rownames_to_column() %>%
  #left_join(data.frame(subset(annotation, select = Age)) %>% rownames_to_column()) %>%
  left_join(annotation %>% rownames_to_column()) %>%
  mutate(Age = as.numeric(Age)) %>%
  left_join(ordering) %>%
  mutate(ordered_cluster = factor(ordered_cluster, levels = seq(1, 6, 1)))
## Joining, by = "rowname"
## Joining, by = "cluster"
ggplot(clusters, aes(x = ordered_cluster, y = Age)) +
  geom_boxplot(fill = c("#ffd166", "#ef476f", "#06d6a0", "#06d6a0", "#118ab2", "#073b4c")) +
  theme_bw() + 
  geom_hline(yintercept = c(15.5, 36.5, 60.5, 85.5), color="red", linetype="dashed")  +
  theme(text = element_text(size = 18)) +
  xlab("Hierarchical cluster")

ggplot(clusters, aes(x = cluster, y = Age, color = cluster)) +
  geom_point() +
  theme_bw() + 
  geom_hline(yintercept = c(15.5, 36.5, 60.5, 85.5))+
  theme(text = element_text(size = 20))

6.3 LASSO robustness

# LASSO regressions with varying penalty parameter lamda
model1 <- glmnet(subset(df, select = -c(sample, Age)), df$Age, alpha = 1, lambda = 0.25)
lasso1 <- data.frame(ensembl_id = dimnames(coef(model1))[[1]], coefficient = matrix(coef(model1))) %>%
  filter(coefficient != 0) %>%
  filter(ensembl_id != "(Intercept)") %>%
  mutate(lambda = 0.25)

model2 <- glmnet(subset(df, select = -c(sample, Age)), df$Age, alpha = 1, lambda = 0.5)
lasso2 <- data.frame(ensembl_id = dimnames(coef(model2))[[1]], coefficient = matrix(coef(model2))) %>%
  filter(coefficient != 0) %>%
  filter(ensembl_id != "(Intercept)") %>%
  mutate(lambda = 0.5)

model3 <- glmnet(subset(df, select = -c(sample, Age)), df$Age, alpha = 1, lambda = 2)
lasso3 <- data.frame(ensembl_id = dimnames(coef(model3))[[1]], coefficient = matrix(coef(model3))) %>%
  filter(coefficient != 0) %>%
  filter(ensembl_id != "(Intercept)") %>%
  mutate(lambda = 2)

model4 <- glmnet(subset(df, select = -c(sample, Age)), df$Age, alpha = 1, lambda = 4)
lasso4 <- data.frame(ensembl_id = dimnames(coef(model4))[[1]], coefficient = matrix(coef(model4))) %>%
  filter(coefficient != 0) %>%
  filter(ensembl_id != "(Intercept)") %>%
  mutate(lambda = 4)

all_models <- rbind(lasso1, lasso2, mutate(lasso, lambda = 1), lasso3, lasso4) %>%
  group_by(lambda) %>% 
  group_split()
names(all_models) <- c("Lambda = 0.25", "Lambda = 0.5", "Lambda = 1", "Lambda = 2", "Lambda = 4")
all_models <- lapply(all_models, function(list) list$ensembl_id)

upset(fromList(all_models), order.by = "freq", 
       mainbar.y.label = "Intersections", sets.x.label = "LASSO-selected genes",
       text.scale = c(4, 4, 3.5, 3.5, 4, 4), nintersects = 20,
       keep.order= TRUE, sets = c("Lambda = 4", "Lambda = 2", 
                                  "Lambda = 1", "Lambda = 0.5", "Lambda = 0.25"))

6.4 PCA on LASSO genes

# variance-stabilizing transformation
vsd <- vst(dds)
# filter for LASSO genes
keep <- rownames(vsd) %in% lasso$ensembl_id
vsd <- vsd[keep,]

# PCA
pca <- plotPCA(vsd, intgroup = c("age_group"), returnData = TRUE) 
age_groups <- data.frame("age_group" = c("1-15", "16-26", "27-60", "61-85", "86-96"),
                         "Groups" = c("Group 1", "Group 2", "Group 3", 
                                          "Group 4", "Group 5"))
pca %<>% left_join(age_groups)
## Joining, by = "age_group"
percentVar <- round(100 * attr(pca, "percentVar"))
ggplot(pca, aes(-PC1, PC2, color=Groups)) +
  geom_point(size=2) +
  xlab(paste0("- PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) +
  #coord_fixed() +
  theme_bw() +
  theme(text = element_text(size = 18)) +
  scale_colour_manual(values = c('#ffd166', '#ef476f', '#06d6a0', '#118ab2', '#073b4c')) + 
  labs(color='Age Groups') 

7 Differential expression youngest vs. oldest group

# calculate foldchanges between youngest and oldest age group
res <- results(dds, contrast=c("age_group", '86-96', '1-15'))
res$gene <- rowData(dds)$symbol
res <- subset(res, select = c(gene,  log2FoldChange, pvalue, padj)) %>%
  as.data.frame() %>%
  rownames_to_column(var = "ensembl_id") %>%
  filter(gene != "") %>%
  drop_na(padj) %>%
  filter(padj < 0.05) %>%
  mutate(transition = "fc_1-15_86-96")

# split DE genes for various comparisons into list to visualize the intersections
transitions <- c("fc_16-26_27-60", "fc_61-85_86-96", "fc_1-15_86-96")
top_list <- subset(p_0.05, select = c(gene, transition)) %>% 
  filter(transition %in% c("fc_16-26_27-60", "fc_61-85_86-96")) %>%
  bind_rows(subset(res, select = c(gene, transition))) %>%
  mutate(transition = factor(transition, levels = transitions)) %>%
  group_by(transition) %>% 
  group_split()
names(top_list) <- c("group2_vs_group3", "group4_vs_group5", "group1_vs_group5")
top_list <- lapply(top_list, function(list) list$gene)

upset(fromList(top_list), order.by = "freq", 
       mainbar.y.label = "Intersections", sets.x.label = "DE Genes",
       text.scale = c(2.5, 2.5, 2, 2, 2.5, 2.5), nintersects = 10,
       keep.order= TRUE, sets = c("group1_vs_group5", "group4_vs_group5", "group2_vs_group3"))

8 Gene length analysis

groups <- data.frame("group" = c("Upregulated DE genes", "Downregulated DE genes", "Non-DE genes", "Non-DE genes"),
                     "significant" = c("DE genes", "DE genes", "non DE genes", "non DE genes"), 
                     "direction" = c("upregulated", "downregulated", "upregulated", "downregulated"))

res <- results(dds, contrast=c("age_group", '86-96', '1-15'))
res$gene <- rowData(dds)$symbol
res <- subset(res, select = c(gene,  log2FoldChange, pvalue, padj)) %>%
  as.data.frame() %>%
  rownames_to_column(var = "ensembl_gene_id") %>%
  filter(gene != "") %>%
  drop_na(padj) %>%
  mutate(significant = ifelse(padj < 0.05, "DE genes", "non DE genes")) %>%
  mutate(direction = ifelse(log2FoldChange > 0, "upregulated", "downregulated")) %>%
  left_join(groups)
## Joining, by = c("significant", "direction")
# Query gene information from biomart
genes <- unique(res$ensembl_gene_id)
mart <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
gene_lengths <- getBM(filters="ensembl_gene_id", 
                          attributes=c("ensembl_gene_id", 
                                       "chromosome_name", "start_position", 
                                       "end_position", "external_gene_name"), 
                          values=c(genes), mart=mart) %>%
  data.frame() %>%
  group_by(ensembl_gene_id) %>%
  summarize(chr = dplyr::first(chromosome_name), start = dplyr::first(start_position), 
            end = dplyr::first(end_position), gene_symbol = dplyr::first(external_gene_name)) %>%
  mutate(gene_length = end - start) %>%
  drop_na(gene_length) %>%
  dplyr::filter(chr %in% as.character(seq(1,22,1))) %>%
  subset(select = c(ensembl_gene_id, gene_length)) %>%
  right_join(res)
## Joining, by = "ensembl_gene_id"
ggplot(gene_lengths, aes(x = group, y = log(gene_length), fill = group)) +
  geom_boxplot() +
  theme_bw() + 
  xlab("") + ylab("Logarithmic gene length (bp)") +
  theme(text = element_text(size = 22)) + 
  theme(axis.text.x = element_blank(), axis.ticks = element_blank()) +
  scale_fill_manual(values = c('#1565c0', '#999999', '#c62828'))
## Warning: Removed 1533 rows containing non-finite values (`stat_boxplot()`).

# T-test for downregulated genes
t.test(dplyr::filter(gene_lengths, group == "Downregulated DE genes")$gene_length, 
       dplyr::filter(gene_lengths, group == "Non-DE genes")$gene_length, 
       mu = 0, alternative = "less")
## 
##  Welch Two Sample t-test
## 
## data:  dplyr::filter(gene_lengths, group == "Downregulated DE genes")$gene_length and dplyr::filter(gene_lengths, group == "Non-DE genes")$gene_length
## t = -11.247, df = 9335.9, p-value < 2.2e-16
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
##       -Inf -23191.84
## sample estimates:
## mean of x mean of y 
##  52805.58  79970.85
# T-test for upregulated genes
t.test(dplyr::filter(gene_lengths, group == "Upregulated DE genes")$gene_length, 
       dplyr::filter(gene_lengths, group == "Non-DE genes")$gene_length, 
       mu = 0, alternative = "greater")
## 
##  Welch Two Sample t-test
## 
## data:  dplyr::filter(gene_lengths, group == "Upregulated DE genes")$gene_length and dplyr::filter(gene_lengths, group == "Non-DE genes")$gene_length
## t = 4.7429, df = 8070.7, p-value = 1.071e-06
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  8997.297      Inf
## sample estimates:
## mean of x mean of y 
##  93745.94  79970.85
fc_length <- foldchanges %>%
  left_join(mutate(subset(foldchanges_filtered, select = c(ensembl_id, transition, log2FoldChange)), 
                   DE_group = ifelse(log2FoldChange > 0, "DE genes upregulated", "DE genes downregulated"))) %>%
  mutate(group = replace_na(DE_group, "non DE genes")) %>%
  #mutate(significant = ifelse(padj < 0.05, "DE genes", "non DE genes")) %>%
  dplyr::rename(ensembl_gene_id = ensembl_id) 
## Joining, by = c("ensembl_id", "transition", "log2FoldChange")
age_groups = data.frame(transition = c("fc_1-15_16-26", "fc_16-26_27-60", "fc_27-60_61-85", 
                 "fc_61-85_86-96"), 
                   age_transition = c("Group 1 vs. Group 2", "Group 2 vs. Group 3", "Group 3 vs. Group 4", "Group 4 vs. Group 5"))

# Query gene information from biomart
genes <- unique(fc_length$ensembl_gene_id)
mart <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
gene_lengths <- getBM(filters="ensembl_gene_id", 
                          attributes=c("ensembl_gene_id", 
                                       "chromosome_name", "start_position", 
                                       "end_position", "external_gene_name"), 
                          values=c(genes), mart=mart) %>%
  data.frame() %>%
  group_by(ensembl_gene_id) %>%
  summarize(chr = dplyr::first(chromosome_name), start = dplyr::first(start_position), 
            end = dplyr::first(end_position), gene_symbol = dplyr::first(external_gene_name)) %>%
  mutate(gene_length = end - start) %>%
  dplyr::filter(chr %in% as.character(seq(1,22,1))) %>%
  subset(select = c(ensembl_gene_id, gene_length)) %>%
  right_join(fc_length) %>%
  left_join(age_groups)
## Joining, by = "ensembl_gene_id"
## Joining, by = "transition"
ggplot(gene_lengths, aes(x = DE_group, y = log(gene_length), fill = DE_group)) +
  geom_boxplot() +
  facet_wrap(~age_transition) +
  theme_bw() + 
  xlab("") + ylab("log gene length") +
  theme(axis.text.x = element_blank(), axis.ticks = element_blank())
## Warning: Removed 6132 rows containing non-finite values (`stat_boxplot()`).

# T-test for downregulated genes
#t.test(dplyr::filter(gene_lengths, transition == "Group 4 vs. Group 5", DE_group == "DE genes downregulated")$gene_length, 
#       dplyr::filter(gene_lengths, transition == "Group 4 vs. Group 5", DE_group == "non DE genes")$gene_length, 
#       mu = 0, alternative = "less")

# T-test for upregulated genes
#t.test(dplyr::filter(gene_lengths, transition == "Group 4 vs. Group 5", DE_group == "DE genes upregulated")$gene_length, 
#       dplyr::filter(gene_lengths, transition == "Group 4 vs. Group 5", DE_group == "non DE genes")$gene_length, 
#       mu = 0, alternative = "greater")